import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import tree
import graphviz
%matplotlib inline
# Load the assessment data; column 0 (the visitor id) is read as a string so
# the long numeric ids are not mangled by numeric conversion.
df = pd.read_csv('assessment_data.csv', dtype={0: 'str'})
# df.head(5)
# normalise the column names: lower-case everything, then give a few
# columns shorter / clearer names for easier handling
df.columns = [name.lower() for name in df.columns]
better_names = {
    'totaltransactionrevenue': 'transaction_amount',
    'devicecategory': 'device',
    'operatingsystem': 'os',
    'channelgrouping': 'channel',
}
df = df.rename(columns=better_names)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 464439 entries, 0 to 464438 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fullvisitorid 464439 non-null object 1 visitnumber 464439 non-null int64 2 date 464439 non-null int64 3 visitstarttime 464439 non-null int64 4 bounces 232178 non-null float64 5 pageviews 464387 non-null float64 6 timeonsite 231612 non-null float64 7 transaction_amount 6308 non-null float64 8 transactions 6308 non-null float64 9 source 464439 non-null object 10 medium 464439 non-null object 11 campaign 464439 non-null object 12 device 464439 non-null object 13 os 464439 non-null object 14 mobiledevicemodel 464439 non-null object 15 city 464439 non-null object 16 channel 464439 non-null object dtypes: float64(5), int64(3), object(9) memory usage: 60.2+ MB
The DataFrame uses a lot of memory, so we will change dtypes wherever possible and remove columns that are inconsistent or not useful for us.
It can also be seen that columns[4:9] contain a lot of null values.
# df['mobiledevicemodel'].unique()
# df[(df['city'] == 'not available in demo dataset') & (df['transaction_amount'] > 0)]
# mobile device model and city carry no usable signal here, so drop them
df = df.drop(columns=['mobiledevicemodel', 'city'])
# removing rows that have inconsistent values in columns with dtype=object
# df['os'].unique()
# df[df['campaign'] == '(not set)']
# 'campaign' is dropped as a whole column; for os / source / medium only the
# rows holding the placeholder '(not set)' are removed
df = df.drop(columns='campaign')
for placeholder_col in ('os', 'source', 'medium'):
    df = df[df[placeholder_col] != '(not set)']
def check_null_per(d_set):
    """Print the percentage of null values for every column of *d_set*
    that has at least one missing entry.

    Columns with no missing values are skipped.
    """
    for col in d_set.columns:
        frac = d_set[col].isnull().mean()  # fraction of nulls, in [0, 1]
        if frac == 0:
            continue  # ignoring columns that dont have missing values
        # BUG FIX: the raw fraction used to be printed with a '%' suffix
        # (e.g. "0.50% " for half-null columns); scale it to a percentage
        print(f'{col}\t-\t{frac * 100}% null values')
# report the share of missing values per column before imputing them below
check_null_per(df)
bounces - 0.5011851883951206% null values pageviews - 0.00011266873226009143% null values timeonsite - 0.5001603362728316% null values transaction_amount - 0.9863345828006846% null values transactions - 0.9863345828006846% null values
# checking columns with null values and replacing with appropriate values
#df['bounces'].unique()
# NOTE: `df[col].fillna(..., inplace=True)` works on a column slice (chained
# assignment) -- deprecated in pandas >= 2.1 and removed in 3.0 -- so the
# results are assigned back to the DataFrame explicitly instead.
# a missing 'bounces' value means no bounce was recorded -> 0
df['bounces'] = df['bounces'].fillna(0)
# only a handful of rows lack 'pageviews' (see the null report above); drop them
df = df.drop(df[df['pageviews'].isnull()].index)
# impute missing 'timeonsite' with the rounded column mean
avg_time = df['timeonsite'].mean().round()
df['timeonsite'] = df['timeonsite'].fillna(avg_time)
# it is safe to assume that if a transaction has not been made then the transaction revenue amount = 0
# the same goes for transactions column
df[(df['transaction_amount'].isnull())]  # (notebook display cell: inspect the still-null rows)
df['transaction_amount'] = df['transaction_amount'].fillna(0)
df['transactions'] = df['transactions'].fillna(0)
# df[['bounces', 'pageviews', 'timeonsite', 'transactions', 'transaction_amount']] =
# df[['bounces', 'pageviews', 'timeonsite','transactions','transaction_amount']].astype('int64')
def change_col_dtype(d_set, col_list):
    """Shrink dtypes of the given columns in place.

    float64 columns are cast to int64; object columns are label-encoded
    through the category dtype, and the resulting code -> label mapping is
    printed (unless the column has too many unique labels to be readable).
    """
    for col in col_list:
        col_dtype = d_set[col].dtype
        if col_dtype == 'float64':
            d_set[col] = d_set[col].astype('int64')
        elif col_dtype == 'object':
            as_category = d_set[col].astype('category')
            label_cat_code = dict(enumerate(as_category.cat.categories))
            if len(label_cat_code) > 21:
                print(f"\n{col} column data consists large number of unique labels --not printing\n")
            else:
                print(f"{col}\t-\t", label_cat_code, '\n')
            # store the integer category codes instead of the raw strings
            d_set[col] = as_category.cat.codes
# downcast the imputed numeric columns to int64
change_col_dtype(df, ['bounces', 'pageviews', 'timeonsite', 'transactions', 'transaction_amount'])
# lets get the month out of date
df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
# df['date'].dt.month.unique()
df.rename(columns={'visitstarttime': 'visit_time'}, inplace=True)
# keep only the hour of day of each visit (epoch seconds -> hour 0-23)
df['visit_time'] = pd.to_datetime(df['visit_time'], unit='s').dt.hour
# df['channel'].unique()
# fix a truncated channel label; column-slice `replace(..., inplace=True)` is
# deprecated chained assignment in pandas >= 2.1, so assign the result back
df['channel'] = df['channel'].replace('Organic S', 'Organic Search')
# we can see months from date for graphs such as:
# on which of the months the store items were transacted the most for the year 2017
# line plot of transaction amount over time (seaborn aggregates duplicate
# dates into a mean line with a confidence band)
plt.figure(1, figsize=(12,8))
# NOTE(review): the 'seaborn' style alias was removed in matplotlib >= 3.6;
# newer matplotlib needs 'seaborn-v0_8' -- confirm the pinned version
plt.style.use('seaborn')
sns.lineplot(data=df, x='date', y='transaction_amount', color='#191970')
plt.xlabel('Months', labelpad=15, size=15)
plt.ylabel('Transaction Amount (10^7)', labelpad=15, size=15)
plt.title('Transactions amount by Month', size=20, pad=11)
plt.xticks(size=11)
plt.yticks(size=11)
plt.show()
Our data consists of months 1-8 (i.e., January to August) of the year 2017.
From the above we can see that, compared to the other months, a higher amount of transactions was made in months 4 & 5 (i.e., April and May).
# we can also check what hour of the data the store is visited the most
# for this we can bin the hours as morning, afternoon, evening, night/latenight
hour = df['visit_time'].to_frame()
def day_hour(x):
    """Bin an hour of day (0-23) into a named slot of the day.

    Morning is 5-11, Afternoon 12-16, Evening 17-20; everything else
    (21-23 and 0-4) is Night / Late Night.
    """
    # idiom fix: chained comparisons instead of bitwise '&' on booleans
    # (also sidesteps the precedence trap of `x>=5 & x<=11` if the
    # parentheses were ever dropped)
    if 5 <= x <= 11:
        return 'Morning'
    if 12 <= x <= 16:
        return 'Afternoon'
    if 17 <= x <= 20:
        return 'Evening'
    return 'Night / Late Night'
# label each visit with its day-part bin
hour['bin'] = hour['visit_time'].apply(day_hour)
plt.figure(2, figsize=(12,8))
# NOTE(review): 'seaborn' style alias removed in matplotlib >= 3.6 -- confirm version
plt.style.use('seaborn')
# horizontal count plot: number of visits per day-part
sns.countplot(y=hour['bin'].values, palette='Blues_r_d')
plt.title("Visits by Day's Hour", pad=15, fontsize=16, fontweight='bold')
plt.xlabel('Visit Count', size=15, labelpad=15, fontweight='bold')
plt.xticks(size=12)
plt.yticks(size=13)
plt.show()
# Morning - contains visitors from 5am - 11am
# Afternoon - contains visitors from 12pm - 16pm
# Evening - contains visitors from 17pm - 20pm
# Night/Late night - contains visitors from 21pm - 4am
It seems that a large number of visitors visit the site during the night or late night (i.e., from 9pm to 4am).
# bring the transaction columns alongside the day-part bins
hour[['transaction_amount', 'transactions']] = df[['transaction_amount', 'transactions']]
# lets see which bin has higher amounts of transactions made
plt.figure(3, figsize=(12,8))
plt.style.use('ggplot')
# bar height is the mean transaction_amount per bin (seaborn's default estimator)
sns.barplot(data=hour, x='bin', y='transaction_amount', palette='magma')
plt.title("Transactions amount by hour", size=17, fontweight='bold', pad=20)
plt.yticks(size=13)
plt.xticks(size=14)
plt.xlabel('')
plt.ylabel('Transaction Amount (10^6)', labelpad=15, size=16, fontweight='bold')
# let's see at what time most transactions as per the bins
plt.figure(4, figsize=(12,8))
plt.style.use('ggplot')
# bar height is the mean number of transactions per bin
sns.barplot(data=hour, x='bin', y='transactions', palette='magma')
plt.title("Transactions made by hour", size=17, fontweight='bold', pad=20)
plt.yticks(size=13)
plt.xticks(size=14)
plt.xlabel('')
plt.ylabel('Transactions made', labelpad=15, size=16, fontweight='bold')
plt.show()
Surprisingly, even though most of the store visits happen during the night/late-night slot, as we can see above most of the transactions (as well as the higher-amount transactions) are made during the evening.
Though the number of transactions made during the night/late night is on the higher side as well, the transaction amounts are low compared to the evening and afternoon.
# lets find the reason
hour['bounces'] = df['bounces'].copy()
# side-by-side count plots of visits per bin, split by bounce value
# (presumably 0 = no bounce recorded, 1 = bounced -- see the fillna(0) step)
sns.catplot(data=hour, x='bin', col='bounces', kind='count', col_wrap=2)
plt.show()
I assumed the reason why transactions during the night/late night were low, despite having the highest visit count, was that visitors exited the site without triggering any sessions (i.e., only a single page was visited).
However, that does not seem to be the case.
It can also be seen that evenings have an overall lower bounce count.
# lets get the required data into hour, see the correlation and prune the least-correlated features
feat = ['pageviews','date', 'source', 'medium', 'channel', 'os', 'device']
hour[feat] = df[feat].copy()
# before we check the correlation we will change the object dtype to category
# (label-encode the string columns so they appear in the correlation matrix)
change_col_dtype(hour, ['bin', 'source', 'medium', 'channel', 'os', 'device'])
bin - {0: 'Afternoon', 1: 'Evening', 2: 'Morning', 3: 'Night / Late Night'}
source column data consists large number of unique labels --not printing
medium - {0: '(none)', 1: 'affiliate', 2: 'cpc', 3: 'cpm', 4: 'organic', 5: 'referral'}
channel - {0: '(Other)', 1: 'Affiliates', 2: 'Direct', 3: 'Display', 4: 'Organic Search', 5: 'Paid Search', 6: 'Referral', 7: 'Social'}
os - {0: 'Android', 1: 'BlackBerry', 2: 'Chrome OS', 3: 'Firefox OS', 4: 'FreeBSD', 5: 'Linux', 6: 'Macintosh', 7: 'NTT DoCoMo', 8: 'Nintendo 3DS', 9: 'Nintendo Wii', 10: 'Nintendo WiiU', 11: 'OpenBSD', 12: 'Samsung', 13: 'Windows', 14: 'Windows Phone', 15: 'Xbox', 16: 'iOS'}
device - {0: 'desktop', 1: 'mobile', 2: 'tablet'}
# correlation matrix of the (now numeric) features; numeric_only=True keeps
# the datetime 'date' column from raising on pandas >= 2.0 (older pandas
# silently excluded non-numeric columns, so the values are unchanged)
df_corr = hour.corr(numeric_only=True)
plt.figure(figsize=(12,8))
sns.heatmap(df_corr, annot=True)
plt.show()
Surprisingly, the transactions made and pageviews are correlated, hence we will put pageviews in our model to see if a visitor will transact.
Similarly, source & medium are the most correlated. This is because medium is a grouping for source, and channel is a grouping for source & medium combined; yet the correlation factor is lower for channel & medium than for channel & source.
# for our final graph lets see through which device what channels are accessed
# pivot: rows = channel, columns = device, values = visit counts (aggfunc=len counts rows)
dev_ch = pd.pivot_table(df[['device', 'channel']], index='channel',
                        columns=['device'], aggfunc=len)
# (channel, device) pairs that never occur come back as NaN -> treat as 0 visits
dev_ch.replace(np.nan, 0, inplace=True)
# stacked horizontal bars, one stack segment per device, sorted by total visits
fig = px.bar(data_frame=np.round(dev_ch.astype('int64')), barmode='stack', orientation='h',
             title='Visit count by channel groupings through device', labels={'value': 'Visit Count'})
fig.update_layout(yaxis={'categoryorder':'total ascending'})
# mean transaction amount per (channel, device) pair
dev_ch_transact = pd.pivot_table(df[['device', 'channel', 'transaction_amount']], index='channel',
                                 columns=['device'], values='transaction_amount', aggfunc=np.mean)
# pairs that never occur come back as NaN -> treat as 0
dev_ch_transact.replace(np.nan, 0, inplace=True)
# BUG FIX: `10^6` is bitwise XOR (== 12), not ten to the sixth power --
# the amounts were being divided by 12 instead of scaled to millions
fig = px.bar(data_frame=np.round(dev_ch_transact.astype('int64') / (10**6)), orientation='h',
             barmode='stack', title='Transaction amount by Channel groupings through device',
             labels={'value': 'Transaction Amount'})
fig.update_layout(yaxis={'categoryorder':'total ascending'})
From the first graph it can be seen that the site is being visited the most through Organic Search and that most of the channels are accessed from a desktop.
Although, in the second graph there is an unexpected finding that even though the visit count is less for Display, through any device, the transactions made/amount is most when the traffic comes from Display channel and are accessed from desktop.
# creating a new df before starting with the decision tree with required features only
# let's look at the features we are going to select once again
tree_data = hour.copy()
tree_data['date'] = tree_data['date'].dt.month  # keep only the month number
# BUG FIX: rename() returns a new frame -- the original call discarded the
# result, so the column silently stayed 'bin'; assign the result back
tree_data = tree_data.rename(columns={'bin': 'day_hour'})
# removing transactions amount from variable as it will not be present for new visitor
X = tree_data.drop(columns=['transactions', 'transaction_amount'])
y = tree_data['transactions']
# chi2 requires non-negative features; all columns here are non-negative
# counts / hours / category codes
var = SelectKBest(score_func=chi2, k=10)
best_fit = var.fit(X, y)
dfscores = pd.DataFrame(best_fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
features = pd.concat([dfcolumns, dfscores], axis=1)
features.columns = ['feat', 'score']
print(features.nlargest(10, 'score'))  # looking at top 10 features that are closest to the target
feat score 3 pageviews 984301.000572 5 source 139301.730161 6 medium 4505.904226 2 bounces 3189.976791 8 os 2198.352867 0 visit_time 1489.228448 9 device 1195.170502 7 channel 200.540407 4 date 88.517131 1 bin 12.704489
We will take the top-ranked features (excluding `source`, which has a very large number of unique labels) and check the accuracy.
# separating the target variable
# binarise the target: did the visitor transact at all (1) or not (0)
tree_data['transactions'] = np.where(tree_data['transactions'] >= 1, 1, 0)
X = tree_data[['pageviews', 'medium', 'bounces', 'os', 'visit_time', 'device']]
y = tree_data['transactions']
# the classes are heavily imbalanced (~1.4% of rows have a transaction, per
# df.info above), so stratify the split to keep the positive rate equal in
# train and test; a fixed random_state makes the reported score reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.03,
                                                    stratify=y, random_state=10000)
clf = DecisionTreeClassifier(max_depth=4, random_state=10000, min_impurity_decrease=0)
clf.fit(X_train, y_train)
# plt.figure(figsize=(20,10))
# tree.plot_tree(clf, feature_names=X.columns,filled=True, rounded=True)
# plt.show()
# export the fitted tree to Graphviz DOT source and render it inline;
# rotate=True lays the tree out left-to-right
diag = tree.export_graphviz(clf, out_file=None, feature_names=X.columns, rotate=True,
                            special_characters=True, filled=True, rounded=True, precision=2)
graph = graphviz.Source(diag)
# graph.render(filename='decision_tree', format='png')
graph  # last expression in the cell -> the notebook displays the rendered tree
One noticeable thing, which I assume is prominent, is that as the pageviews decrease (below 8) the sample gets purer, suggesting that the visitor's chances of transacting are low.
On the other hand, as the pageviews increase (around 23 - 25) the sample impurity rises, suggesting that the visitors' chances of transacting rise.
# predict if the person is going to transact
y_pred = clf.predict(X_test)
y_pred.sum()  # number of test visitors predicted to transact (notebook display cell)
# NOTE(review): with ~1.4% positives the accuracy is dominated by the majority
# class -- precision/recall or a confusion matrix would be more informative
print("Accuracy Score of the model\t-\t", accuracy_score(y_test, y_pred)*100)
Accuracy Score of the model - 98.69989165763813